R Markdown

library(arsenal)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(here)
## here() starts at /Users/stephcopeland/Documents/GitHub/222 Final Project/last take
library(tidyr)
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:arsenal':
## 
##     %nin%, N
library(tibble)
library(calecopal)
library(ggeffects)
library(gt)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
data1 <- read.csv("last_take.csv")
head(data1)
##       country code area_sqkm population GDP_capita        dalys
## 1 Afghanistan  AFG    652860   36296111   519.8889 2.544699e+05
## 2      Angola  AGO   1246700   29816769  4095.8101 8.006665e+05
## 3     Albania  ALB     27400    2873457  4531.0208 1.543584e+02
## 4     Andorra  AND       470      76997 38964.9045 3.942648e-01
## 5   Argentina  ARG   2736690   44044811 14613.0418 2.894639e+04
## 6     Armenia  ARM     28470    2944789  3914.5279 1.241430e+03
##   tree_cover_loss_ha
## 1           25.70585
## 2       280456.32110
## 3         1689.02503
## 4            3.42216
## 5       233919.67780
## 6           41.74732
data2 <- data1 %>% 
  mutate(prop_tree_loss = (tree_cover_loss_ha/area_sqkm)) %>% 
  mutate(prop_dalys = (dalys/population))
head(data2)
##       country code area_sqkm population GDP_capita        dalys
## 1 Afghanistan  AFG    652860   36296111   519.8889 2.544699e+05
## 2      Angola  AGO   1246700   29816769  4095.8101 8.006665e+05
## 3     Albania  ALB     27400    2873457  4531.0208 1.543584e+02
## 4     Andorra  AND       470      76997 38964.9045 3.942648e-01
## 5   Argentina  ARG   2736690   44044811 14613.0418 2.894639e+04
## 6     Armenia  ARM     28470    2944789  3914.5279 1.241430e+03
##   tree_cover_loss_ha prop_tree_loss   prop_dalys
## 1           25.70585   3.937421e-05 7.010940e-03
## 2       280456.32110   2.249589e-01 2.685289e-02
## 3         1689.02503   6.164325e-02 5.371872e-05
## 4            3.42216   7.281192e-03 5.120522e-06
## 5       233919.67780   8.547540e-02 6.572033e-04
## 6           41.74732   1.466362e-03 4.215684e-04
data3 <- data2 %>% 
  mutate(per_tree_loss = (prop_tree_loss * 100)) %>% 
  mutate(per_dalys = (prop_dalys * 100))
head(data3)
##       country code area_sqkm population GDP_capita        dalys
## 1 Afghanistan  AFG    652860   36296111   519.8889 2.544699e+05
## 2      Angola  AGO   1246700   29816769  4095.8101 8.006665e+05
## 3     Albania  ALB     27400    2873457  4531.0208 1.543584e+02
## 4     Andorra  AND       470      76997 38964.9045 3.942648e-01
## 5   Argentina  ARG   2736690   44044811 14613.0418 2.894639e+04
## 6     Armenia  ARM     28470    2944789  3914.5279 1.241430e+03
##   tree_cover_loss_ha prop_tree_loss   prop_dalys per_tree_loss    per_dalys
## 1           25.70585   3.937421e-05 7.010940e-03   0.003937421 0.7010939935
## 2       280456.32110   2.249589e-01 2.685289e-02  22.495894850 2.6852892837
## 3         1689.02503   6.164325e-02 5.371872e-05   6.164324938 0.0053718717
## 4            3.42216   7.281192e-03 5.120522e-06   0.728119233 0.0005120522
## 5       233919.67780   8.547540e-02 6.572033e-04   8.547540196 0.0657203287
## 6           41.74732   1.466362e-03 4.215684e-04   0.146636171 0.0421568401
ggplot(data3, aes(sample = prop_dalys)) +
  geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).

ggplot(data3, aes(sample = dalys)) +
  geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).

data3 <- data3 %>% 
  mutate(logdalys = log(dalys))
ggplot(data3, aes(sample = logdalys)) +
  geom_qq() + geom_qq_line()
## Warning: Removed 13 rows containing non-finite values (stat_qq).
## Warning: Removed 13 rows containing non-finite values (stat_qq_line).

data4 <- data3 %>% 
  mutate(log_dalys = log(prop_dalys))
head(data4)
##       country code area_sqkm population GDP_capita        dalys
## 1 Afghanistan  AFG    652860   36296111   519.8889 2.544699e+05
## 2      Angola  AGO   1246700   29816769  4095.8101 8.006665e+05
## 3     Albania  ALB     27400    2873457  4531.0208 1.543584e+02
## 4     Andorra  AND       470      76997 38964.9045 3.942648e-01
## 5   Argentina  ARG   2736690   44044811 14613.0418 2.894639e+04
## 6     Armenia  ARM     28470    2944789  3914.5279 1.241430e+03
##   tree_cover_loss_ha prop_tree_loss   prop_dalys per_tree_loss    per_dalys
## 1           25.70585   3.937421e-05 7.010940e-03   0.003937421 0.7010939935
## 2       280456.32110   2.249589e-01 2.685289e-02  22.495894850 2.6852892837
## 3         1689.02503   6.164325e-02 5.371872e-05   6.164324938 0.0053718717
## 4            3.42216   7.281192e-03 5.120522e-06   0.728119233 0.0005120522
## 5       233919.67780   8.547540e-02 6.572033e-04   8.547540196 0.0657203287
## 6           41.74732   1.466362e-03 4.215684e-04   0.146636171 0.0421568401
##     logdalys  log_dalys
## 1 12.4469377  -4.960284
## 2 13.5931998  -3.617382
## 3  5.0392773  -9.831749
## 4 -0.9307325 -12.182254
## 5 10.2732009  -7.327517
## 6  7.1240192  -7.771529
ggplot(data4, aes(sample = log_dalys)) +
  geom_qq() + geom_qq_line()
## Warning: Removed 15 rows containing non-finite values (stat_qq).
## Warning: Removed 15 rows containing non-finite values (stat_qq_line).

data4 <- data4 %>% 
  mutate(log_GDP = log(GDP_capita))
mod1 <- lm(logdalys ~ prop_tree_loss, data = data4)

summary(mod1)
## 
## Call:
## lm(formula = logdalys ~ prop_tree_loss, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.059  -2.194   0.059   2.519   7.314 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      9.1297     0.2579  35.397   <2e-16 ***
## prop_tree_loss  -0.1450     0.1040  -1.395    0.165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.343 on 174 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.01105,    Adjusted R-squared:  0.00537 
## F-statistic: 1.945 on 1 and 174 DF,  p-value: 0.1649
ggplot(data4, aes(y = log_dalys, x = prop_tree_loss)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

data4 <- data4 %>% 
  mutate(win_prop_tree = Winsorize(prop_tree_loss, minval = NULL, maxval = NULL, probs = c(0.00, 0.95), na.rm = TRUE, type = 9))
graph1 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
print(graph1)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

mod2 <- lm(logdalys ~ win_prop_tree, data = data4)

summary(mod2)
## 
## Call:
## lm(formula = logdalys ~ win_prop_tree, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8125 -2.1737 -0.1597  2.6571  7.4237 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.8773     0.3137  28.297   <2e-16 ***
## win_prop_tree   0.6133     0.6480   0.946    0.345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.353 on 174 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.005121,   Adjusted R-squared:  -0.0005966 
## F-statistic: 0.8957 on 1 and 174 DF,  p-value: 0.3453
names(mod2)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"
graph2 <- ggplot(data4, aes(y = log_dalys, x = win_prop_tree, color = log_GDP)) +
  geom_point(aes(size = 0.5)) +
  stat_smooth(method = "lm", se = FALSE)
print(graph2)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

mod3 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data4)

summary(mod3)
## 
## Call:
## lm(formula = log_dalys ~ win_prop_tree + log_GDP, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1512 -0.9453  0.2060  0.9732  4.7871 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.69968    0.78175   6.012 1.09e-08 ***
## win_prop_tree  0.71536    0.32810   2.180   0.0306 *  
## log_GDP       -1.36044    0.08909 -15.271  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.672 on 170 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5783 
## F-statistic: 118.9 on 2 and 170 DF,  p-value: < 2.2e-16
names(mod3)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"
graph3 <- ggplot(data4, aes(y = log_dalys, x = log_GDP, color = win_prop_tree)) +
  geom_point(aes(size = 0.5)) +
  stat_smooth(method = "lm", se = FALSE)
print(graph3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 16 rows containing non-finite values (stat_smooth).
## Warning: Removed 16 rows containing missing values (geom_point).

data5 <- data4 %>% 
  mutate(country_income_level = 
           case_when(GDP_capita >= 13000 ~ "high", 
                     GDP_capita >= 4000 ~ "middle", 
                     GDP_capita <= 4000 ~ "low"))
data5 %>% 
  group_by(country_income_level) %>% 
  summarize(n = n())
## # A tibble: 4 × 2
##   country_income_level     n
##   <chr>                <int>
## 1 high                    52
## 2 low                     72
## 3 middle                  51
## 4 <NA>                    14
equLOWcountries <- c("Burundi", "Benin", "Bhangladesh", "Bolivia", "Central African Republic", "Cote d'Ivoire", "Cameroon", "Democratic Republic of Congo", "Congo", "Comoros", "Ethiopia", "Ghana", "Guinea", "Gambia", "Honduras", "Haiti", "Indonesia", "Kenya", "Cambodia", "Laos", "Liberia", "Madagascar", "Mali", "Myanmar", "Mozambique",
"Mauritania", "Malawi", "Nigeria", "Nicaragua", "Phillipines", "Rwanda", "Senegal", "Sierra Leone", "El Salvador", "Eswatini", "Togo", "Tanzania", "Uganda", "Vietnam", "Zambia", "Zimbabwe")
noneqLOWcountries <- c("AFG", "ARM", "BTN", "CPV", "EGY", "ERI", "FSM", "GNB", "IND", "KGZ", "LSO", "MAR", "MDA", "MNG", "NER", "NPL", "PAK", "PNG", "SDN", "SLB", "SSD", "STP", "SYR", "TCD", "TJK", "TLS", "TUN", "UKR", "UZB", "VUT") 
data6 <- data5 %>% 
  filter(country_income_level == "low") %>% 
  filter(!(code %in% c(noneqLOWcountries)))

mod3 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data4)

mod4 <- lm(log_dalys ~ win_prop_tree, data = data6)

summary(mod4)
## 
## Call:
## lm(formula = log_dalys ~ win_prop_tree, data = data6)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.499 -1.321  0.180  1.315  2.359 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.9986     0.3069 -13.030  5.6e-16 ***
## win_prop_tree  -0.3044     0.4773  -0.638    0.527    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.417 on 40 degrees of freedom
## Multiple R-squared:  0.01007,    Adjusted R-squared:  -0.01468 
## F-statistic: 0.4068 on 1 and 40 DF,  p-value: 0.5272
ggplot(data6, aes(y = log_dalys, x = win_prop_tree)) +
  geom_point(aes(color = GDP_capita, size = 1.0)) +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

mod5 <- lm(log_dalys ~ win_prop_tree + log_GDP, data = data6)

summary(mod5)
## 
## Call:
## lm(formula = log_dalys ~ win_prop_tree + log_GDP, data = data6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0354 -0.7776 -0.2154  0.8573  2.1058 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.89557    1.78369   2.745  0.00911 ** 
## win_prop_tree -0.01454    0.38072  -0.038  0.96973    
## log_GDP       -1.27217    0.25277  -5.033 1.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.117 on 39 degrees of freedom
## Multiple R-squared:  0.3999, Adjusted R-squared:  0.3691 
## F-statistic: 12.99 on 2 and 39 DF,  p-value: 4.743e-05
ggplot(data6, aes(y = log_dalys, x = GDP_capita)) +
  geom_point(aes(color = win_prop_tree, size = 2)) +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Interactive version
p <- data5 %>%
  mutate(GDP_capita=round(GDP_capita,0)) %>%
  mutate(population=round(population/1000000,2)) %>%
  mutate(log_dalys=round(log_dalys,1)) %>%
  
  # Reorder countries to having big bubbles on top
  arrange(desc(population)) %>%
  #mutate(country = factor(country, country)) %>%
  
  # prepare text for tooltip
  mutate(text = paste("Country: ", country, "\nPopulation (M): ", population, "\n DALYs: ", log_dalys, "\nGdp per capita: ", GDP_capita, sep="")) %>%
  
  # Classic ggplot
  ggplot( aes(x=GDP_capita, y=log_dalys, size = population, color = country_income_level, text=text)) +
    geom_point(alpha=0.5) +
    scale_size(range = c(1.4, 19), name="Global DALYs") +
    scale_color_viridis(discrete=TRUE, guide=FALSE) +
    theme_ipsum() +
    theme(legend.position="none")

pp <- ggplotly(p, tooltip="text")
pp

Hypothesis Testing